Purpose: compare normalization methods for a particular dataset
source(file.path("scripts", "CZI_functions.R"))
if (!("Rtsne" %in% installed.packages())) {
install.packages("Rtsne")
}
if (!("NMI" %in% installed.packages())) {
install.packages("NMI")
}
if (!("caret" %in% installed.packages())) {
install.packages("caret")
}
# Get the file names of all the normalized files
normalized.files <- dir("darmanis_data/normalized_darmanis")
# Read in each of the normalization files
normalized.data <- lapply(normalized.files, function(file) {
readr::read_tsv(file.path("darmanis_data/normalized_darmanis",
file))
})
## Parsed with column specification:
## cols(
## .default = col_double(),
## gene = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Genes = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Genes = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Genes = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Genes = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Genes = col_character()
## )
## See spec(...) for full column specifications.
# Keep all the gene lists
genes <- lapply(normalized.data, function(x) x[,1])
# Keep the names with it.
names(normalized.data) <- gsub("\\.tab|darmanis_", "", normalized.files)
# Read in the data
meta <- readr::read_tsv(file.path("darmanis_data", "GSE84465_meta.tsv"))
## Parsed with column specification:
## cols(
## .default = col_character(),
## channel_count = col_double(),
## taxid_ch1 = col_double(),
## contact_phone = col_double(),
## `contact_zip/postal_code` = col_double(),
## data_row_count = col_double(),
## `plate id:ch1` = col_double(),
## `tsne cluster:ch1` = col_double()
## )
## See spec(...) for full column specifications.
# Get sample names from columns
samples <- colnames(normalized.data[[1]])[-1]
# Keep metadata only for the samples we have
meta <- meta[match(samples, meta$geo_accession), ]
# Extra cell types info as it's own vector
cell.types <- as.factor(meta$`cell type:ch1`)
plate.batch <- as.factor(gsub("plate id: ", "", meta$`characteristics_ch1.1`))
# Run tsne on each dataset and extract
tsne <- lapply(normalized.data, function(dat) {
tsne.res <- Rtsne::Rtsne(t(dat[,-1]),
check_duplicates = FALSE)
tsne.res <- tsne.res$Y
names(tsne.res) <- samples
return(tsne.res)
})
# Make a function for plotting tsne's and using
tsne.plot <- function(dat, var, name = "name"){
# This function is to plot tsne data and label it by a variable
# Args:
# dat: a data.frame with two columns of data
# var: vector contains metadata labels
# name: a character string for the plot title and for to save as png
# Returns:
# png and plot in Rmd of the provided data with labels of the metadata provided
# convert celltype into numbers
colz <- colors(distinct = TRUE)[runif(length(levels(var)), min = 1,
max = length(colors(distinct=TRUE)))]
plot(dat,pch = 21, bg = colz[var], main = name);
legend(x = "bottomleft", legend = levels(var), fill = colz, cex = 0.6)
}
# Plot em with cell type and plate batch labels
lapply(tsne, function(dataset) {
# Get normalization method
set.name <- names(tsne)[parent.frame()$i[]]
# Make plots for cell type and plate batch
cell.type.plot <- tsne.plot(dataset, cell.types, name = set.name)
plate.batch.plot <- tsne.plot(dataset, plate.batch, name = set.name)
# Save the plots to pngs
png(paste0("tsne_", set.name, "cell_type.png"))
cell.type.plot
dev.off()
png(paste0("tsne_", set.name, "plate_batch.png"))
plate.batch.plot
dev.off()
# Print out the plots in the Rmd
cell.type.plot
plate.batch.plot
})












## $counts
## $counts$rect
## $counts$rect$w
## [1] 8.141351
##
## $counts$rect$h
## [1] 94.51097
##
## $counts$rect$left
## [1] -24.61422
##
## $counts$rect$top
## [1] 71.69175
##
##
## $counts$text
## $counts$text$x
## [1] -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946
## [8] -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946
## [15] -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946
## [22] -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946
## [29] -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946
## [36] -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946
## [43] -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946 -22.20946
## [50] -22.20946
##
## $counts$text$y
## [1] 69.8385972 67.9854409 66.1322846 64.2791283 62.4259719
## [6] 60.5728156 58.7196593 56.8665030 55.0133467 53.1601904
## [11] 51.3070341 49.4538778 47.6007215 45.7475652 43.8944089
## [16] 42.0412526 40.1880962 38.3349399 36.4817836 34.6286273
## [21] 32.7754710 30.9223147 29.0691584 27.2160021 25.3628458
## [26] 23.5096895 21.6565332 19.8033769 17.9502205 16.0970642
## [31] 14.2439079 12.3907516 10.5375953 8.6844390 6.8312827
## [36] 4.9781264 3.1249701 1.2718138 -0.5813425 -2.4344988
## [41] -4.2876552 -6.1408115 -7.9939678 -9.8471241 -11.7002804
## [46] -13.5534367 -15.4065930 -17.2597493 -19.1129056 -20.9660619
##
##
##
## $DESeq2
## $DESeq2$rect
## $DESeq2$rect$w
## [1] 6.615268
##
## $DESeq2$rect$h
## [1] 112.4473
##
## $DESeq2$rect$left
## [1] -25.8007
##
## $DESeq2$rect$top
## [1] 84.78113
##
##
## $DESeq2$text
## $DESeq2$text$x
## [1] -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671
## [8] -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671
## [15] -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671
## [22] -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671
## [29] -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671
## [36] -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671
## [43] -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671 -23.84671
## [50] -23.84671
##
## $DESeq2$text$y
## [1] 82.5762771 80.3714288 78.1665804 75.9617321 73.7568838
## [6] 71.5520355 69.3471872 67.1423389 64.9374906 62.7326423
## [11] 60.5277940 58.3229457 56.1180974 53.9132491 51.7084008
## [16] 49.5035525 47.2987042 45.0938559 42.8890076 40.6841593
## [21] 38.4793110 36.2744627 34.0696144 31.8647661 29.6599178
## [26] 27.4550695 25.2502212 23.0453729 20.8405246 18.6356763
## [31] 16.4308280 14.2259797 12.0211314 9.8162831 7.6114348
## [36] 5.4065865 3.2017382 0.9968899 -1.2079584 -3.4128067
## [41] -5.6176550 -7.8225033 -10.0273516 -12.2321999 -14.4370482
## [46] -16.6418965 -18.8467448 -21.0515931 -23.2564414 -25.4612897
##
##
##
## $log2
## $log2$rect
## $log2$rect$w
## [1] 8.306228
##
## $log2$rect$h
## [1] 90.09418
##
## $log2$rect$left
## [1] -26.15157
##
## $log2$rect$top
## [1] 64.74442
##
##
## $log2$text
## $log2$text$x
## [1] -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812
## [8] -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812
## [15] -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812
## [22] -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812
## [29] -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812
## [36] -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812
## [43] -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812 -23.69812
## [50] -23.69812
##
## $log2$text$y
## [1] 62.9778700 61.2113175 59.4447650 57.6782125 55.9116600
## [6] 54.1451075 52.3785550 50.6120025 48.8454500 47.0788975
## [11] 45.3123450 43.5457925 41.7792400 40.0126875 38.2461350
## [16] 36.4795825 34.7130300 32.9464775 31.1799250 29.4133725
## [21] 27.6468200 25.8802675 24.1137150 22.3471625 20.5806100
## [26] 18.8140575 17.0475049 15.2809524 13.5143999 11.7478474
## [31] 9.9812949 8.2147424 6.4481899 4.6816374 2.9150849
## [36] 1.1485324 -0.6180201 -2.3845726 -4.1511251 -5.9176776
## [41] -7.6842301 -9.4507826 -11.2173351 -12.9838876 -14.7504401
## [46] -16.5169926 -18.2835451 -20.0500976 -21.8166501 -23.5832026
##
##
##
## $scVLM
## $scVLM$rect
## $scVLM$rect$w
## [1] 8.859139
##
## $scVLM$rect$h
## [1] 84.06793
##
## $scVLM$rect$left
## [1] -31.52045
##
## $scVLM$rect$top
## [1] 58.87337
##
##
## $scVLM$text
## $scVLM$text$x
## [1] -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368
## [8] -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368
## [15] -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368
## [22] -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368
## [29] -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368
## [36] -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368
## [43] -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368 -28.90368
## [50] -28.90368
##
## $scVLM$text$y
## [1] 57.2249841 55.5765932 53.9282024 52.2798115 50.6314206
## [6] 48.9830297 47.3346388 45.6862480 44.0378571 42.3894662
## [11] 40.7410753 39.0926845 37.4442936 35.7959027 34.1475118
## [16] 32.4991209 30.8507301 29.2023392 27.5539483 25.9055574
## [21] 24.2571666 22.6087757 20.9603848 19.3119939 17.6636030
## [26] 16.0152122 14.3668213 12.7184304 11.0700395 9.4216487
## [31] 7.7732578 6.1248669 4.4764760 2.8280851 1.1796943
## [36] -0.4686966 -2.1170875 -3.7654784 -5.4138693 -7.0622601
## [41] -8.7106510 -10.3590419 -12.0074328 -13.6558236 -15.3042145
## [46] -16.9526054 -18.6009963 -20.2493872 -21.8977780 -23.5461689
##
##
##
## $TMM
## $TMM$rect
## $TMM$rect$w
## [1] 9.576485
##
## $TMM$rect$h
## [1] 91.43549
##
## $TMM$rect$left
## [1] -33.33695
##
## $TMM$rect$top
## [1] 65.31656
##
##
## $TMM$text
## $TMM$text$x
## [1] -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829
## [8] -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829
## [15] -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829
## [22] -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829
## [29] -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829
## [36] -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829
## [43] -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829 -30.50829
## [50] -30.50829
##
## $TMM$text$y
## [1] 63.5237050 61.7308521 59.9379993 58.1451465 56.3522937
## [6] 54.5594409 52.7665881 50.9737352 49.1808824 47.3880296
## [11] 45.5951768 43.8023240 42.0094711 40.2166183 38.4237655
## [16] 36.6309127 34.8380599 33.0452071 31.2523542 29.4595014
## [21] 27.6666486 25.8737958 24.0809430 22.2880902 20.4952373
## [26] 18.7023845 16.9095317 15.1166789 13.3238261 11.5309733
## [31] 9.7381204 7.9452676 6.1524148 4.3595620 2.5667092
## [36] 0.7738563 -1.0189965 -2.8118493 -4.6047021 -6.3975549
## [41] -8.1904077 -9.9832606 -11.7761134 -13.5689662 -15.3618190
## [46] -17.1546718 -18.9475246 -20.7403775 -22.5332303 -24.3260831
##
##
##
## $voom
## $voom$rect
## $voom$rect$w
## [1] 8.410643
##
## $voom$rect$h
## [1] 83.74553
##
## $voom$rect$left
## [1] -27.94895
##
## $voom$rect$top
## [1] 58.74276
##
##
## $voom$text
## $voom$text$x
## [1] -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465
## [8] -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465
## [15] -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465
## [22] -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465
## [29] -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465
## [36] -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465
## [43] -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465 -25.46465
## [50] -25.46465
##
## $voom$text$y
## [1] 57.1006897 55.4586204 53.8165511 52.1744818 50.5324125
## [6] 48.8903432 47.2482739 45.6062046 43.9641353 42.3220659
## [11] 40.6799966 39.0379273 37.3958580 35.7537887 34.1117194
## [16] 32.4696501 30.8275808 29.1855115 27.5434422 25.9013728
## [21] 24.2593035 22.6172342 20.9751649 19.3330956 17.6910263
## [26] 16.0489570 14.4068877 12.7648184 11.1227491 9.4806797
## [31] 7.8386104 6.1965411 4.5544718 2.9124025 1.2703332
## [36] -0.3717361 -2.0138054 -3.6558747 -5.2979440 -6.9400134
## [41] -8.5820827 -10.2241520 -11.8662213 -13.5082906 -15.1503599
## [46] -16.7924292 -18.4344985 -20.0765678 -21.7186371 -23.3607065
kmeans_eval <- function(feature, metadata = metadata, iter = 10, seed =1234) {
# This function is used to perform iterative k-means clustering based on projected
# features for single cell data and then evaluate the performance according to
# Nomalized mutual information (NMI) and adjusted rand index (ARI)
#
# Args:
# feature: a data.frame contains projected features (n dimensional space, n = 2, 3 ...), the columns are
# projected features in n dimensional space for a sample (cell), rows are samples
# celltype: vector contains cell type informantion
# iter: number of interation for k-means clustering
# seed: seed for k-means clustering
# Returns:
# NMI and ARI results
# convert celltype into numbers
metadata_num <- as.numeric(factor(metadata))
sample_id <- seq(1:nrow(feature))
# iterative k-means
nmi_score_all <- c()
ari_score_all <- c()
all_cluster <- list()
for(i in 1:iter){
# set k equal to the number of celltypes in the dataset
k <- length(unique(metadata))
# perform k means clustering
km <- kmeans(feature, k)
# true clusters
orignal_data <- data.frame(sample_id, metadata_num)
# predicted clusters
cl_data <- data.frame(sample_id, km$cluster)
# calculate NMI and ARI score
nmi_score <- NMI::NMI(orignal_data, cl_data)$value
ari_score <- mclust::adjustedRandIndex(km$cluster, metadata_num)
nmi_score_all <- c(nmi_score_all, nmi_score)
ari_score_all <- c(ari_score_all, ari_score)
}
# Compile all results into a data.frame
results <- data.frame(ari = ari_score_all, nmi = nmi_score_all)
return(results)
}
knn_eval <- function(feature, metadata = metadata, k = 10){
# This function performs knn based evaluation
# Args:
# feature: a data.frame contains projected features (n dimensional space, n = 2, 3 ...), the columns are
# projected features in n dimensional space for a sample (cell), rows are samples
# celltype: vector contains cell type information
# k: cross validation fold
# Returns:
# list of accuracy scores for each iteration of KNN
# Make the data into a data.frame:
feature <- data.frame("tsne" = feature, "metadata" = metadata)
# Split observations into groups
cv <- cvTools::cvFolds(nrow(feature), K = k, R = 1)
# Create empty objects to store the performance information for each iteration
perf.eval <- list()
confusion.matrix <- 0
# Go through this iteration that k times
for (i in 1:k) {
# Isolate samples for training the model
train <- feature[cv$subsets[-which(cv$which == i)], ]
# Isolate samples for testing the model
test <- feature[cv$subsets[which(cv$which == i)], ]
# Perform KNN model fitting
knn.fit <- caret::train(metadata~. , data = train, method = "knn",
trControl = caret::trainControl(method = "cv", number = 3),
preProcess = c("center", "scale"),
tuneLength = 10)
# Evaluate the model
knn.pred <- predict(knn.fit, newdata = subset(test, select = -c(metadata)))
perf.eval[[i]] <- round(cal_performance(knn.pred, test$metadata, 3), 2)
# Make the results into a matrix
matrix <- as.matrix(table(test$metadata, knn.pred, deparse.level = 0))
confusion.matrix <- matrix + confusion.matrix
}
# Get mean performance of cross validation
perf.eval <- dplyr::bind_rows(perf.eval)
accuracy <- perf.eval$accuracy
return(data.frame("knn" = accuracy))
}
# Get knn and kmeans results for all tsne's of all datasets
cell.type.results <- lapply(tsne, function(dataset) {
knn.results <- knn_eval(dataset, metadata = cell.types)
kmeans.results <- kmeans_eval(dataset, metadata = cell.types)
data.frame(knn.results, kmeans.results)
})
## Loading required package: lattice
## Loading required package: ggplot2
# Get knn and kmeans results for all tsne's of all datasets
batch.results <- lapply(tsne, function(dataset) {
# knn.results <- knn_eval(dataset, metadata = plate.batch)
kmeans.results <- kmeans_eval(dataset, metadata = plate.batch)
# data.frame(knn.results, kmeans.results)
data.frame(kmeans.results)
})
plot.results <- function(results.list, name = "results") {
# This function makes a boxplot of the cluster statistic results
# Args:
# results.list: a list of dataframes which each column contains a different
# statistic for 10 iterations
# name: name to use for the png to be saved and the plot title
# Returns: boxplots of the normalization method cluster statistics. Prints the
# plots and save the plot as a png
# Get meta info
meta <- names(unlist(results.list))
# Transform list into a dataframe:
ggplot.df <- data.frame("method" = stringr::word(meta, 1, sep = "\\."),
"test" = gsub("[0-9]*$", "",
stringr::word(meta, 2, sep = "\\.")),
"iter" = rep(1:nrow(results.list[[1]]),
length(results.list)*ncol(results.list[[1]])),
"values" = unlist(results.list))
# Make the plot
plot <- ggplot(data = ggplot.df, aes(x = method, y = values, fill = test)) +
geom_boxplot(position = position_dodge()) +
xlab("Normalization method") +
ggtitle(name) +
facet_wrap(~test)
# Save plot to png
ggsave(paste0(name, "cluster_results.png"), width = 10)
# Print plot
plot
}
# Plot the cell type results
plot.results(cell.type.results, name = "Cell_type")
## Saving 10 x 5 in image

# Plot the plate batch results
plot.results(batch.results, name = "Plate_batch")
## Saving 10 x 5 in image

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin17.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libLAPACK.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] caret_6.0-81 ggplot2_3.1.0 lattice_0.20-35
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 purrr_0.2.5 reshape2_1.4.3
## [4] splines_3.5.1 colorspace_1.3-2 generics_0.0.2
## [7] stats4_3.5.1 htmltools_0.3.6 yaml_2.2.0
## [10] prodlim_2018.04.18 survival_2.42-6 rlang_0.3.0.1
## [13] e1071_1.7-0 ModelMetrics_1.2.2 pillar_1.3.0
## [16] NMI_2.0 withr_2.1.2 glue_1.3.0
## [19] bindrcpp_0.2.2 foreach_1.4.4 plyr_1.8.4
## [22] bindr_0.1.1 lava_1.6.4 robustbase_0.93-3
## [25] stringr_1.3.1 timeDate_3043.102 munsell_0.5.0
## [28] gtable_0.2.0 recipes_0.1.4 codetools_0.2-15
## [31] evaluate_0.12 labeling_0.3 knitr_1.20
## [34] class_7.3-14 DEoptimR_1.0-8 Rcpp_0.12.19
## [37] readr_1.3.0 scales_1.0.0 backports_1.1.2
## [40] ipred_0.9-8 hms_0.4.2 digest_0.6.18
## [43] stringi_1.2.4 Rtsne_0.15 dplyr_0.7.7
## [46] grid_3.5.1 rprojroot_1.3-2 tools_3.5.1
## [49] magrittr_1.5 lazyeval_0.2.1 tibble_1.4.2
## [52] crayon_1.3.4 cvTools_0.3.2 pkgconfig_2.0.2
## [55] MASS_7.3-51 Matrix_1.2-14 data.table_1.11.8
## [58] lubridate_1.7.4 gower_0.1.2 assertthat_0.2.0
## [61] rmarkdown_1.10 iterators_1.0.10 mclust_5.4.1
## [64] R6_2.3.0 rpart_4.1-13 nnet_7.3-12
## [67] nlme_3.1-137 compiler_3.5.1